{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "29489ded",
   "metadata": {
    "id": "29489ded"
   },
   "source": [
    "# How to evaluate LLMs with text descriptors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "997d80b6",
   "metadata": {
    "id": "997d80b6"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from datetime import datetime\n",
    "from datetime import time\n",
    "from datetime import timedelta\n",
    "\n",
    "import requests\n",
    "from io import BytesIO\n",
    "\n",
    "from sklearn import datasets, ensemble, model_selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dcacbf00",
   "metadata": {
    "id": "dcacbf00"
   },
   "outputs": [],
   "source": [
    "from evidently.ui.workspace.cloud import CloudWorkspace\n",
    "\n",
    "from evidently import ColumnMapping\n",
    "from evidently.report import Report\n",
    "from evidently.test_suite import TestSuite\n",
    "\n",
    "from evidently.metrics import ColumnSummaryMetric, ColumnDistributionMetric, ColumnDriftMetric, DataDriftTable, TextDescriptorsDistribution, ColumnCategoryMetric\n",
    "from evidently.tests import TestColumnValueMin, TestColumnValueMean, TestCategoryShare, TestShareOfOutRangeValues\n",
    "\n",
    "from evidently.metric_preset import DataDriftPreset, DataQualityPreset, TextOverviewPreset, TextEvals\n",
    "\n",
    "from evidently.descriptors import HuggingFaceModel, HuggingFaceToxicityModel, OpenAIPrompting \n",
    "from evidently.descriptors import RegExp, BeginsWith, EndsWith, Contains, DoesNotContain, IncludesWords, ExcludesWords\n",
    "from evidently.descriptors import TextLength, OOV, NonLetterCharacterPercentage, SentenceCount, WordCount, Sentiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9640360",
   "metadata": {
    "id": "a9640360",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import nltk\n",
    "\n",
    "# Fetch the NLTK resources one after another. The last download stays a bare\n",
    "# expression so that the cell still displays its return value.\n",
    "for resource in ('words', 'wordnet', 'omw-1.4'):\n",
    "    nltk.download(resource)\n",
    "nltk.download('vader_lexicon')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9d0dd05c-1a80-4d51-84dd-93f1776f1352",
   "metadata": {},
   "source": [
    "# Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2038e2b-acdb-4844-a759-48170f1f539b",
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_URL = \"https://raw.githubusercontent.com/evidentlyai/evidently/main/examples/how_to_questions/chat_df.csv\"\n",
    "\n",
    "# Download the example chat logs. Fail fast on an HTTP error or a stalled connection\n",
    "# instead of handing an error page to the CSV parser in the next cell.\n",
    "response = requests.get(DATA_URL, timeout=30)\n",
    "response.raise_for_status()\n",
    "csv_content = BytesIO(response.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6dcad6c8-9abb-4c66-aa58-4e370baef072",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the downloaded CSV; both timestamp columns are parsed as dates.\n",
    "assistant_logs = pd.read_csv(\n",
    "    csv_content,\n",
    "    index_col=0,\n",
    "    parse_dates=['start_time', 'end_time'],\n",
    ")\n",
    "\n",
    "# Index the rows by start_time (the column itself is kept) and name the index 'index'.\n",
    "assistant_logs.index = pd.Index(assistant_logs['start_time'], name='index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82975800-0f47-458e-a43e-43d4accbb446",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.set_option('display.max_colwidth', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f16eb76-9100-4a67-a1a3-bb836d4c4b7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "assistant_logs[[\"question\", \"response\"]].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96f0677b-511d-4254-b6f8-972edd21e1ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "assistant_logs.iloc[6].question"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db78b522-deab-489e-bd24-501c629d019b",
   "metadata": {},
   "outputs": [],
   "source": [
    "assistant_logs.iloc[6].response"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "efe64a1b-a445-40ab-8125-7a7cf5e703ae",
   "metadata": {},
   "source": [
    "# One-off reports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1de2ddab-24a9-49ef-a1c8-b3e5121efce5",
   "metadata": {},
   "outputs": [],
   "source": [
    "column_mapping = ColumnMapping(\n",
    "    datetime='start_time',\n",
    "    datetime_features=['end_time'],\n",
    "    text_features=['question', 'response'],\n",
    "    categorical_features=['organization', 'model_ID', 'region', 'environment', 'feedback'],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1e9da172-14aa-4614-96df-9ba57170ae28",
   "metadata": {},
   "source": [
    "### Simple descriptors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed5c518a-e4b9-4911-879e-9fd008cab51b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Built-in descriptors that take no parameters besides a display name\n",
    "question_descriptors = [\n",
    "    Sentiment(display_name=\"Question sentiment\"),\n",
    "    TextLength(display_name=\"Question length\"),\n",
    "    OOV(display_name=\"Question out of vocabulary words\"),\n",
    "]\n",
    "response_descriptors = [\n",
    "    Sentiment(display_name=\"Response sentiment\"),\n",
    "    NonLetterCharacterPercentage(display_name=\"Non letter characters in response\"),\n",
    "    SentenceCount(display_name=\"Sentence count in response\"),\n",
    "    WordCount(display_name=\"Word count in response\"),\n",
    "]\n",
    "\n",
    "report = Report(metrics=[\n",
    "    TextEvals(column_name=\"question\", descriptors=question_descriptors),\n",
    "    TextEvals(column_name=\"response\", descriptors=response_descriptors),\n",
    "])\n",
    "\n",
    "# Label-based slices of the start_time index: 8-9 April is the reference, 9-10 April is current.\n",
    "reference_data = assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)]\n",
    "current_data = assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)]\n",
    "\n",
    "report.run(\n",
    "    reference_data=reference_data,\n",
    "    current_data=current_data,\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ad83213-f387-40c3-81d4-6c8c428b85cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Built-in descriptors that take parameters (prefix/suffix, regular expression, word or item lists)\n",
    "report = Report(metrics=[\n",
    "    TextEvals(column_name=\"question\", descriptors=[\n",
    "        BeginsWith(display_name=\"'How' question\", prefix=\"How\"),\n",
    "        RegExp(reg_exp=r\"^I\", display_name=\"Question begins with 'I'\"),\n",
    "        IncludesWords(words_list=['invoice', 'salary'], display_name=\"Questions about invoices and salary\"),\n",
    "    ]),\n",
    "    TextEvals(column_name=\"response\", descriptors=[\n",
    "        # Label typo fixed: it previously read \"Assisrance\".\n",
    "        EndsWith(display_name=\"Assistance might be needed\", suffix=\"for assistance.\"),\n",
    "        ExcludesWords(words_list=['wrong', 'mistake'], display_name=\"Responses without mention of mistakes\"),\n",
    "        Contains(items=['medical leave'], display_name=\"contains 'medical leave'\"),\n",
    "        DoesNotContain(items=['employee portal'], display_name=\"does not contain 'employee portal'\"),\n",
    "    ]),\n",
    "])\n",
    "\n",
    "# Label-based slices of the start_time index: 8-9 April is the reference, 9-10 April is current.\n",
    "report.run(\n",
    "    reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)],\n",
    "    current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)],\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "report"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "53d28599-0aae-4b5f-9940-db5b7af07625",
   "metadata": {},
   "source": [
    "### Model-based descriptors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "71c99066-e9f9-4922-a95a-3217a8aeec21",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Descriptors backed by Hugging Face models: each entry names the model, optional\n",
    "# model-specific params, and the label shown in the report.\n",
    "report = Report(metrics=[\n",
    "    TextEvals(column_name=\"response\", descriptors=[\n",
    "        HuggingFaceModel(model=\"DaNLP/da-electra-hatespeech-detection\", display_name=\"Hugging Face Toxicity for response\"),\n",
    "        HuggingFaceModel(model=\"SamLowe/roberta-base-go_emotions\", params={\"label\": \"disappointment\"},\n",
    "                         display_name=\"Hugging Face Disappointment for response\"),\n",
    "        HuggingFaceModel(model=\"SamLowe/roberta-base-go_emotions\", params={\"label\": \"optimism\"},\n",
    "                         display_name=\"Hugging Face Optimism for response\"),\n",
    "        HuggingFaceModel(model=\"MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli\", params={\"labels\": [\"HR\", \"finance\"], \"threshold\":0.5},\n",
    "                         display_name=\"Hugging Face Topic\"),\n",
    "        # Label typo fixed: it previously read \"respone\".\n",
    "        HuggingFaceModel(model=\"lakshyakh93/deberta_finetuned_pii\", params={\"threshold\": 0.6},\n",
    "                         display_name=\"Hugging Face PII for response\"),\n",
    "    ])\n",
    "])\n",
    "\n",
    "# Label-based slices of the start_time index: 8-9 April is the reference, 9-10 April is current.\n",
    "report.run(\n",
    "    reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)],\n",
    "    current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)],\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "\n",
    "report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf603478-b84f-4088-b2d4-1a5595284041",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simplified descriptor for a widely-used Hugging Face model\n",
    "toxicity_descriptors = [HuggingFaceToxicityModel(toxic_label=\"hate\")]\n",
    "report = Report(metrics=[TextEvals(column_name=\"response\", descriptors=toxicity_descriptors)])\n",
    "\n",
    "# Label-based slices of the start_time index: 8-9 April is the reference, 9-10 April is current.\n",
    "reference_data = assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)]\n",
    "current_data = assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)]\n",
    "\n",
    "report.run(\n",
    "    reference_data=reference_data,\n",
    "    current_data=current_data,\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "\n",
    "report"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e1b0f0fb-80eb-436e-83d5-508f274b4f41",
   "metadata": {},
   "source": [
    "### LLM-based descriptors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ce02433-b1c4-4c6f-8bec-73ed5ea01d4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prompt templates for the LLM-based descriptors. REPLACE is the placeholder that is passed\n",
    "# as prompt_replace_string to OpenAIPrompting in the next code cell.\n",
    "# Fixed here: \"Retrun\" typos, a missing space after a comma, and \"the above text\"\n",
    "# (the text placeholder comes after that instruction, not before it).\n",
    "pii_prompt = \"\"\"\n",
    "Personally identifiable information (PII) is information that, when used alone or with other relevant data, can identify an individual.\n",
    "\n",
    "PII may contain direct identifiers (e.g., passport information) that can identify a person uniquely, \n",
    "or quasi-identifiers (e.g., race) that can be combined with other quasi-identifiers (e.g., date of birth) to successfully recognize an individual.\n",
    "PII may contain person's name, person's address, and something I may forget to mention\n",
    "\n",
    "Please identify whether or not the text below contains PII\n",
    "\n",
    "text: REPLACE \n",
    "\n",
    "Use the following categories for PII identification:\n",
    "1 if text contains PII\n",
    "0 if text does not contain PII\n",
    "0 if the information provided is not sufficient to make a clear determination\n",
    "\n",
    "Return a category only\n",
    "\"\"\"\n",
    "\n",
    "negativity_prompt = \"\"\"\n",
    "Classify text into two groups: negative and positive\n",
    "\n",
    "text: REPLACE \n",
    "\n",
    "Use the following categories for classification:\n",
    "NEGATIVE if text is negative\n",
    "POSITIVE if text is NOT negative\n",
    "UNKNOWN use this category only if the information provided is not sufficient to make a clear determination\n",
    "\n",
    "Return only the category\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ac74c17-7bb7-448e-96a0-af8898225cfd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Descriptors with external models.\n",
    "# To run the OpenAIPrompting descriptor, set the environment variable that holds your OpenAI token\n",
    "# (presumably OPENAI_API_KEY - confirm in the Evidently documentation).\n",
    "openai_descriptors = [\n",
    "    OpenAIPrompting(\n",
    "        prompt=pii_prompt,\n",
    "        prompt_replace_string=\"REPLACE\",\n",
    "        model=\"gpt-3.5-turbo-instruct\",\n",
    "        feature_type=\"num\",\n",
    "        display_name=\"PII for response (by gpt3.5)\",\n",
    "    ),\n",
    "    OpenAIPrompting(\n",
    "        prompt=negativity_prompt,\n",
    "        prompt_replace_string=\"REPLACE\",\n",
    "        model=\"gpt-3.5-turbo-instruct\",\n",
    "        feature_type=\"cat\",\n",
    "        display_name=\"Negativity for response (by gpt3.5)\",\n",
    "    ),\n",
    "]\n",
    "report = Report(metrics=[TextEvals(column_name=\"response\", descriptors=openai_descriptors)])\n",
    "\n",
    "# Only the first 20 rows are evaluated and no reference data is passed. To evaluate\n",
    "# the date windows instead, use the same slices as in the earlier report cells.\n",
    "report.run(\n",
    "    reference_data=None,\n",
    "    current_data=assistant_logs[:20],\n",
    "    column_mapping=column_mapping,\n",
    ")\n",
    "\n",
    "report"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ba4ac83b-4d07-4050-95fa-45009ab5aa1d",
   "metadata": {},
   "source": [
    "## Get dataset with calculated descriptors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "88a549cc-79a2-4862-9a8b-994762ea40af",
   "metadata": {},
   "outputs": [],
   "source": [
    "#reference dataset enriched with descriptors\n",
    "#(expected to be None when the latest report was run with reference_data=None)\n",
    "report.datasets().reference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5e7de13-d278-4871-b32d-7934763c6712",
   "metadata": {},
   "outputs": [],
   "source": [
    "#current dataset enriched with descriptors\n",
    "report.datasets().current"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e82accf-59e4-45d2-8a99-fa8caf325c86",
   "metadata": {},
   "source": [
    "# One-off Test Suites"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3729b450-9cf2-408b-943e-ff3d738baba3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-off checks on the first 20 log rows; no reference data is passed.\n",
    "test_suite = TestSuite(tests=[\n",
    "    # Minimum of the Sentiment descriptor on \"response\" must be greater than 0.\n",
    "    TestColumnValueMin(column_name=Sentiment().on(\"response\"), gt=0),\n",
    "    # Share of the \"downvote\" category in \"feedback\" must be below 0.1.\n",
    "    TestCategoryShare(column_name=\"feedback\", category=\"downvote\", lt=0.1),\n",
    "    # Share of responses for which IncludesWords(['salary']) is False must be below 0.1.\n",
    "    # The category is the boolean False, matching create_test_suite further down,\n",
    "    # rather than the string \"False\" that was used here before.\n",
    "    TestCategoryShare(column_name=IncludesWords(words_list=['salary']).on(\"response\"), category=False, lt=0.1),\n",
    "])\n",
    "\n",
    "# Pass the column mapping explicitly, as every other run() call in this notebook does.\n",
    "test_suite.run(reference_data=None, current_data=assistant_logs[:20], column_mapping=column_mapping)\n",
    "test_suite"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8644398d-3c22-43cb-b0e8-136ef8c7abbc",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_suite.datasets().current"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2e5da017-975b-420f-863e-dbc25d82c458",
   "metadata": {},
   "source": [
    "# Monitoring"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ca58db3-f354-4c2f-87f3-ef9ba9a113a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from evidently.ui.workspace.cloud import CloudWorkspace\n",
    "from evidently.ui.dashboards import DashboardPanelTestSuite, ReportFilter, TestSuitePanelType\n",
    "from evidently.renderers.html_widgets import WidgetSize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "283ab028-c7b3-43d2-8271-5232f92f9d57",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Take the token from the EVIDENTLY_API_KEY environment variable when it is set, so the\n",
    "# secret does not have to be pasted into (and saved with) the notebook. The placeholder\n",
    "# is only a fallback and must be replaced if the variable is not set.\n",
    "ws = CloudWorkspace(\n",
    "    token=os.environ.get(\"EVIDENTLY_API_KEY\", \"YOUR_TOKEN_HERE\"),\n",
    "    url=\"https://app.evidently.cloud/\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "151cffdb-3ff6-4553-beb2-9e6af16275c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "project = ws.create_project(\"Project title\", team_id=\"YOUR_TEAM_ID_HERE\")\n",
    "project.description = \"Project description\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39179e4a-d41c-4eef-8a5f-134d24cfa8fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_test_suite(i: int, batch_size: int = 20):\n",
    "    \"\"\"Build and run the monitoring test suite on the i-th batch of `assistant_logs`.\n",
    "\n",
    "    Args:\n",
    "        i: Zero-based batch number. Selects rows [i * batch_size, (i + 1) * batch_size)\n",
    "            by position and sets the suite timestamp to now + i days.\n",
    "        batch_size: Number of rows per batch. Defaults to 20, the value that was\n",
    "            previously hard-coded.\n",
    "\n",
    "    Returns:\n",
    "        The TestSuite after `run` was called on the batch without reference data.\n",
    "    \"\"\"\n",
    "    batch = assistant_logs.iloc[batch_size * i : batch_size * (i + 1), :]\n",
    "    test_suite = TestSuite(\n",
    "        tests=[\n",
    "            TestColumnValueMin(column_name=TextLength().on(\"response\"), gt=100),\n",
    "            TestShareOfOutRangeValues(column_name=TextLength().on(\"question\"), left=30, right=100, lt=0.1),\n",
    "            TestColumnValueMin(column_name=Sentiment().on(\"response\"), gt=0),\n",
    "            TestColumnValueMean(column_name=OOV().on(\"response\"), lt=15),\n",
    "            TestCategoryShare(column_name=\"feedback\", category=\"downvote\", lt=0.1),\n",
    "            TestCategoryShare(column_name=IncludesWords(words_list=['salary']).on(\"response\"), category=False, lt=0.1),\n",
    "        ],\n",
    "        # Each batch is stamped one day after the previous one.\n",
    "        timestamp=datetime.now() + timedelta(days=i),\n",
    "    )\n",
    "    test_suite.run(reference_data=None, current_data=batch, column_mapping=column_mapping)\n",
    "    return test_suite"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12fe69ac-8f15-40be-bd20-185bef699b50",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_report(i: int, batch_size: int = 20):\n",
    "    \"\"\"Build and run the monitoring report on the i-th batch of `assistant_logs`.\n",
    "\n",
    "    Args:\n",
    "        i: Zero-based batch number. Selects rows [i * batch_size, (i + 1) * batch_size)\n",
    "            by position and sets the report timestamp to now + i days.\n",
    "        batch_size: Number of rows per batch. Defaults to 20, the value that was\n",
    "            previously hard-coded.\n",
    "\n",
    "    Returns:\n",
    "        The Report after `run` was called on the batch without reference data.\n",
    "    \"\"\"\n",
    "    batch = assistant_logs.iloc[batch_size * i : batch_size * (i + 1), :]\n",
    "    report = Report(\n",
    "        metrics=[\n",
    "            TextEvals(column_name=\"question\", descriptors=[\n",
    "                Sentiment(display_name=\"Question sentiment\"),\n",
    "                TextLength(display_name=\"Question length\"),\n",
    "                OOV(display_name=\"Question out of vocabulary words\"),\n",
    "            ]),\n",
    "            TextEvals(column_name=\"response\", descriptors=[\n",
    "                Sentiment(display_name=\"Response sentiment\"),\n",
    "                NonLetterCharacterPercentage(display_name=\"Non letter characters in response\"),\n",
    "                SentenceCount(display_name=\"Sentence count in response\"),\n",
    "                WordCount(display_name=\"Word count in response\"),\n",
    "            ]),\n",
    "            # Bind the descriptor with .on(...), the form used everywhere else in this notebook\n",
    "            # (this line previously used .for_column(...)).\n",
    "            ColumnCategoryMetric(column_name=IncludesWords(words_list=['salary']).on(\"response\"), category=True),\n",
    "        ],\n",
    "        # Each batch is stamped one day after the previous one.\n",
    "        timestamp=datetime.now() + timedelta(days=i),\n",
    "    )\n",
    "\n",
    "    report.run(reference_data=None, current_data=batch, column_mapping=column_mapping)\n",
    "    return report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "247392a4-414f-4899-a2a0-09cb56698f33",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the test suite on batches 0..4 and add each result to the project.\n",
    "for batch_index in range(5):\n",
    "    test_suite = create_test_suite(i=batch_index)\n",
    "    ws.add_test_suite(project.id, test_suite)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1c6e042-4b5d-400d-98a3-7a263943881e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Run the report on batches 0..4 and add each result to the project.\n",
    "for batch_index in range(5):\n",
    "    report = create_report(i=batch_index)\n",
    "    ws.add_report(project.id, report)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d4e6c3d-08e2-4ffd-8b9d-50e13ffb643a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Full-width panel titled \"Test results\"; the filter includes test suites and sets no\n",
    "# metadata or tag constraints.\n",
    "test_results_filter = ReportFilter(metadata_values={}, tag_values=[], include_test_suites=True)\n",
    "test_results_panel = DashboardPanelTestSuite(\n",
    "    title=\"Test results\",\n",
    "    filter=test_results_filter,\n",
    "    size=WidgetSize.FULL,\n",
    ")\n",
    "\n",
    "project.dashboard.add_panel(test_results_panel)\n",
    "project.save()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb052be8-2a74-4fc2-a041-c43de294ce44",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
